Online Retail Clean merupakan sebuah perusahaan jasa retail, data set dari data transaksi berisi data customer ID, Frequency dan Monetary
Cutomer ID : adalah ID unik yang dimiliki oleh masing-masing pelanggan
Recency : adalah jumlah hari dari hari terakhir customer membeli ( satuan hari) Frequency : adalah jumlah pembelian yang dilakukan oleh customer ( satuan kali) Monetary : adalah total nilai pembelian dari customer ( satuan Dollar)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(cluster)
library(clValid)
x<-read.csv('https://raw.githubusercontent.com/arikunco/machinelearning/master/dataset/online_retail_clean.csv')
print(summary(x))
## CustomerID recency frequency monetary
## Min. :12347 Min. : 21.47 Min. : 1.000 Min. :-38970.00
## 1st Qu.:13752 1st Qu.: 47.43 1st Qu.: 1.000 1st Qu.: 13.20
## Median :15249 Median : 92.39 Median : 1.000 Median : 26.10
## Mean :15270 Mean :133.05 Mean : 2.259 Mean : 52.63
## 3rd Qu.:16792 3rd Qu.:203.47 3rd Qu.: 2.000 3rd Qu.: 56.73
## Max. :18283 Max. :394.51 Max. :86.000 Max. : 10122.56
Gunakan R sudio Desktop dengan OS Windows atau gunakan R Server dengan Google Chrome Browser
p <- plot_ly(x, x = ~recency, y = ~frequency, z = ~monetary) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Recency'),
yaxis = list(title = 'Frequency'),
zaxis = list(title = 'Monetary')))
p
Dari Summary data dan dari plot data terihat bahwa terdapat data monetary dengan nilai <0 secara logika tidak mungkin jumlah pembelian <0 maka data ini di anggap data invalid yang harus dibuang selain data NA juga harus dibuang dari data set
x<-filter(x, monetary >= 0) %>% na.omit(x)
print(summary(x))
## CustomerID recency frequency monetary
## Min. :12347 Min. : 21.47 Min. : 1.000 Min. : 0.00
## 1st Qu.:13752 1st Qu.: 47.35 1st Qu.: 1.000 1st Qu.: 13.80
## Median :15252 Median : 92.22 Median : 1.000 Median : 27.09
## Mean :15270 Mean :132.11 Mean : 2.267 Mean : 71.20
## 3rd Qu.:16790 3rd Qu.:200.60 3rd Qu.: 2.000 3rd Qu.: 57.50
## Max. :18283 Max. :394.51 Max. :86.000 Max. :10122.56
Pada metode cluster sangat dipengaruhi jarak antar point pada setiap variable dikarenakan setiap variable memiliki satuan dan skala yang berbeda, maka perlu dilakukan sclaling dari setiap nilai pada variable agar memiliki skala yang sama scale dilakukan dengan menghitung z score dari masing2 nilai variable
x.scale <- as.data.frame(scale(x, scale = TRUE))
head(x.scale)
## CustomerID recency frequency monetary
## 1 -1.48786478 -1.062717 -0.07696631 -0.008361669
## 2 0.02366284 -1.062383 9.71540290 3.015655286
## 3 -0.47328562 -1.062210 0.21104455 -0.157988362
## 4 -1.58093490 -1.061669 0.49905541 0.158692626
## 5 -0.47615815 -1.061229 -0.36497716 -0.035219272
## 6 -0.79673300 -1.055713 0.21104455 -0.114150785
summary(x.scale)
## CustomerID recency frequency
## Min. :-1.679176 Min. :-1.0627 Min. :-0.36498
## 1st Qu.:-0.872281 1st Qu.:-0.8141 1st Qu.:-0.36498
## Median :-0.009946 Median :-0.3831 Median :-0.36498
## Mean : 0.000000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.873071 3rd Qu.: 0.6579 3rd Qu.:-0.07697
## Max. : 1.731097 Max. : 2.5204 Max. :24.11595
## monetary
## Min. :-0.21248
## 1st Qu.:-0.17128
## Median :-0.13162
## Mean : 0.00000
## 3rd Qu.:-0.04089
## Max. :29.99504
p <- plot_ly(x.scale, x = ~recency, y = ~frequency, z = ~monetary) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Recency'),
yaxis = list(title = 'Frequency'),
zaxis = list(title = 'Monetary')))
p
Mencari jumlah cluster optimal
fviz_nbclust(x.scale[,2:4], kmeans, method = "wss") +
geom_vline(xintercept = 6, linetype = 2)+
labs(subtitle = "Elbow method")
fviz_nbclust(x.scale[,2:4], kmeans, method = "silhouette")+
labs(subtitle = "Silhouette method")
km.out <- kmeans(x.scale[,2:4], center=4, nstart=10)
x.scale$cluster <- km.out$cluster
x$cluster <- km.out$cluster
plot(x.scale[,2:4], col = x.scale$cluster,
main = "K-MEANS of RFM")
p <- plot_ly(x.scale, x = ~recency, y = ~frequency, z = ~monetary, color = ~cluster) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Recency'),
yaxis = list(title = 'Frequency'),
zaxis = list(title = 'Monetary')))
p
fviz_nbclust(x.scale[,2:4], pam, method = "silhouette")+
theme_classic()
pam.out<-pam(x.scale[,2:4], 2, metric = "euclidean", stand = FALSE)
x.scale$clusterpam <-pam.out$clustering
head(x.scale)
## CustomerID recency frequency monetary cluster clusterpam
## 1 -1.48786478 -1.062717 -0.07696631 -0.008361669 3 1
## 2 0.02366284 -1.062383 9.71540290 3.015655286 1 1
## 3 -0.47328562 -1.062210 0.21104455 -0.157988362 3 1
## 4 -1.58093490 -1.061669 0.49905541 0.158692626 3 1
## 5 -0.47615815 -1.061229 -0.36497716 -0.035219272 3 1
## 6 -0.79673300 -1.055713 0.21104455 -0.114150785 3 1
plot(x.scale[,2:4], col = x.scale$clusterpam,
main = "K-MEDOIDS of RFM")
p <- plot_ly(x.scale, x = ~recency, y = ~frequency, z = ~monetary, color = ~clusterpam) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Recency'),
yaxis = list(title = 'Frequency'),
zaxis = list(title = 'Monetary')))
p
res.dist <- dist(x.scale[,2:4], method = "euclidean")
res.hc <- hclust(d = res.dist, method = "complete")
summary(res.hc)
## Length Class Mode
## merge 4698 -none- numeric
## height 2349 -none- numeric
## order 2350 -none- numeric
## labels 2350 -none- character
## method 1 -none- character
## call 3 -none- call
## dist.method 1 -none- character
plot(res.hc)
clmethods <- c("hierarchical","kmeans","pam")
intr <- clValid(x.scale[,2:4], nClust = 2:6, clMethods = clmethods,validation = "internal" ,maxitems = 2350,metric = "euclidean",method = "complete")
summary(intr)
##
## Clustering Methods:
## hierarchical kmeans pam
##
## Cluster sizes:
## 2 3 4 5 6
##
## Validation Measures:
## 2 3 4 5 6
##
## hierarchical Connectivity 3.8579 10.4567 13.3151 22.8099 25.0266
## Dunn 0.5741 0.2713 0.2808 0.2056 0.2504
## Silhouette 0.9478 0.9077 0.8763 0.8311 0.8039
## kmeans Connectivity 14.4048 12.6524 65.1853 25.7480 102.0024
## Dunn 0.0611 0.1926 0.0005 0.0891 0.0017
## Silhouette 0.9111 0.9066 0.5582 0.7778 0.5400
## pam Connectivity 48.7032 79.6806 99.9222 156.5710 161.3635
## Dunn 0.0003 0.0005 0.0003 0.0003 0.0002
## Silhouette 0.5291 0.3844 0.4448 0.3302 0.3058
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 3.8579 hierarchical 2
## Dunn 0.5741 hierarchical 2
## Silhouette 0.9478 hierarchical 2
optimalScores(intr)
## Score Method Clusters
## Connectivity 3.8579365 hierarchical 2
## Dunn 0.5741036 hierarchical 2
## Silhouette 0.9477798 hierarchical 2
####Diketahui algoritma optimal adalah Hirarki Cluster dengan jumlah cluster = 2
hc.out <- cutree(res.hc, k=2)
x.scale$clusterhc <- hc.out
head(x.scale)
## CustomerID recency frequency monetary cluster clusterpam
## 1 -1.48786478 -1.062717 -0.07696631 -0.008361669 3 1
## 2 0.02366284 -1.062383 9.71540290 3.015655286 1 1
## 3 -0.47328562 -1.062210 0.21104455 -0.157988362 3 1
## 4 -1.58093490 -1.061669 0.49905541 0.158692626 3 1
## 5 -0.47615815 -1.061229 -0.36497716 -0.035219272 3 1
## 6 -0.79673300 -1.055713 0.21104455 -0.114150785 3 1
## clusterhc
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
plot(x.scale[,2:4], col = x.scale$clusterhc,
main = "HC of RFM")
p <- plot_ly(x.scale, x = ~recency, y = ~frequency, z = ~monetary, color = ~clusterhc) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Recency'),
yaxis = list(title = 'Frequency'),
zaxis = list(title = 'Monetary')))
p
Bila dilihat kluster kedua merupakan outlier maka dilakukan perhitungan ulang dengan membuang kluster kedua
x.scale2<-filter(x.scale, monetary <= 16)
intr <- clValid(x.scale2[,2:4], nClust = 2:6, clMethods = clmethods,validation = "internal", maxitems = 2350 ,metric = "euclidean",method = "complete")
## Warning in clValid(x.scale2[, 2:4], nClust = 2:6, clMethods = clmethods, :
## rownames for data not specified, using 1:nrow(data)
summary(intr)
##
## Clustering Methods:
## hierarchical kmeans pam
##
## Cluster sizes:
## 2 3 4 5 6
##
## Validation Measures:
## 2 3 4 5 6
##
## hierarchical Connectivity 6.5988 9.4571 18.9520 21.1687 29.8460
## Dunn 0.2713 0.2808 0.2056 0.2504 0.0409
## Silhouette 0.9077 0.8763 0.8311 0.8039 0.6298
## kmeans Connectivity 8.7944 61.3274 21.8901 98.1444 87.8675
## Dunn 0.1926 0.0005 0.0891 0.0017 0.0020
## Silhouette 0.9067 0.5580 0.7778 0.5398 0.5464
## pam Connectivity 48.2032 79.1806 99.4222 156.0710 160.8635
## Dunn 0.0004 0.0007 0.0004 0.0004 0.0003
## Silhouette 0.5391 0.4009 0.4555 0.3418 0.3174
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 6.5988 hierarchical 2
## Dunn 0.2808 hierarchical 3
## Silhouette 0.9077 hierarchical 2
optimalScores(intr)
## Score Method Clusters
## Connectivity 6.5988095 hierarchical 2
## Dunn 0.2808187 hierarchical 3
## Silhouette 0.9077082 hierarchical 2
res.dist <- dist(x.scale2[,2:4], method = "euclidean")
res.hc <- hclust(d = res.dist, method = "complete")
hc.out <- cutree(res.hc, k=3)
x.scale2$clusterhc <- hc.out
head(x.scale2)
## CustomerID recency frequency monetary cluster clusterpam
## 1 -1.48786478 -1.062717 -0.07696631 -0.008361669 3 1
## 2 0.02366284 -1.062383 9.71540290 3.015655286 1 1
## 3 -0.47328562 -1.062210 0.21104455 -0.157988362 3 1
## 4 -1.58093490 -1.061669 0.49905541 0.158692626 3 1
## 5 -0.47615815 -1.061229 -0.36497716 -0.035219272 3 1
## 6 -0.79673300 -1.055713 0.21104455 -0.114150785 3 1
## clusterhc
## 1 1
## 2 2
## 3 1
## 4 1
## 5 1
## 6 1
plot(x.scale2[,2:4], col = x.scale$clusterhc,
main = "HC of RFM")
p <- plot_ly(x.scale2, x = ~recency, y = ~frequency, z = ~monetary, color = ~clusterhc) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Recency'),
yaxis = list(title = 'Frequency'),
zaxis = list(title = 'Monetary')))
p